Qualitative Analysis and Machine Learning
Photo by Daryan Shamkhali on Unsplash
…75 years since its founding, the General Assembly remains the global forum where all countries have a voice.
— Antonio Guterres, United Nations Secretary-General, 10 January 2021
Multidimensional scaling is a technique that aims to visualize the level of similarity among individual points in a dataset.
The resulting visual is a scatter plot where data points appear to attract and repel each other within the same quadrant.
This is a dataset of member states' roll-call votes in the UN General Assembly from 1946 to 2020 (sessions 1 to 75).
# Load the saved workspace; the code below relies on it providing the
# `completeVotes` table.
load("archetypes/un-votes.RData")
# Print the raw vote table for a quick visual check.
completeVotes
# One row per distinct combination of the listed resolution-level columns,
# with the number of vote records behind it in column `n`.
# NOTE(review): this object shares its name with base::summary(); the name is
# kept because later code may refer to `summary`.
summary <- count(
  completeVotes,
  unres, resid, year, session, abstain, yes, no, date, short, descr,
  me, nu, di, hr, co, ec
)
summary
# Keep only the records used for the similarity analysis and recode each vote
# onto a 0/1/2 scale (`digit_vote`).
# NOTE(review): the codes presumably follow the UN voting data codebook
# (1 = yes, 2 = abstain, 3 = no, 9 = not a member) and `ec` presumably flags
# one resolution topic -- confirm against the data documentation.
df <- completeVotes %>%
  filter(ec == 1 & vote != 9 & nchar(Country) == 3) %>%
  select(vote, Country, Countryname, year, resid) %>%
  mutate(
    digit_vote = case_when(
      vote == 3 ~ 0,
      vote == 1 ~ 2,
      TRUE ~ 1 # every remaining code (e.g. vote == 2) sits in the middle
    )
  )
# Keep only (Country, resid) pairs that occur exactly once, i.e. drop any
# duplicated vote records.
# NOTE(review): the result stays grouped by Country and resid (there is no
# ungroup()); later steps appear to depend on that grouping, so it is kept.
df_1 <- filter(group_by(df, Country, resid), n() == 1)
# Reshape to a wide table: one row per Country, one column per resolution id,
# each cell holding the recoded vote.
df_2 <- df_1 %>%
  select(Country, resid, digit_vote) %>%
  pivot_wider(names_from = resid, values_from = digit_vote)
# Country/resolution pairs with no vote record are NA after pivoting; give
# them the middle value 1.
df_2[is.na(df_2)] <- 1
df_2
# Ask for every column except Country.
# NOTE(review): if df_2 is still grouped by Country (the earlier group_by() is
# never undone), dplyr re-adds grouping columns on select(), so Country would
# remain in df_3 -- confirm; downstream code reads df_3$Country.
df_3 <- select(df_2, -Country)
# Compute MDS
# Classical multidimensional scaling (cmdscale(), default of two dimensions) on
# the Euclidean distances between the countries' vote vectors. The result has
# one row per row of df_3 and the two coordinate columns V1 and V2.
# The Country column is dropped explicitly first: df_3 can still carry it
# (dplyr keeps grouping columns through select()), and dist() needs purely
# numeric input. ungroup() and any_of() make this a no-op when the column is
# already gone.
mds <- df_3 %>%
  ungroup() %>%
  select(-any_of("Country")) %>%
  dist() %>%
  cmdscale() %>%
  # Name the axes explicitly instead of relying on as_tibble()'s fallback
  # naming for unnamed matrix columns.
  `colnames<-`(c("V1", "V2")) %>%
  as_tibble()
mds
This is not absolutely necessary, but it is a nice way to visually highlight groups.
# K-means clustering
# Partition the rows of `mds` into 5 clusters and store the cluster id as a
# factor column `groups`.
# NOTE(review): kmeans() picks random starting centres, so cluster numbering
# (and hence colours) may differ between runs unless a seed is set upstream.
clust <- kmeans(mds, 5)$cluster %>%
  as.factor()
mds <- mds %>%
  mutate(groups = clust)
# Collect labels, coordinates and cluster ids in one plain data frame for
# plotting. Country codes are read from df_2, which always holds the Country
# column and has the same row order as df_3; df_3 is the "Country removed"
# table and only carries the column while it is still grouped.
df_4 <- data.frame(
  country = df_2$Country,
  dim1 = mds$V1,
  dim2 = mds$V2,
  groups = mds$groups
)
df_4
Using ggscatter to plot the clusters.
# Plot and color by groups
# Labelled scatter plot of the MDS coordinates: points coloured by cluster
# ("jco" palette), each cluster outlined by a convex ellipse type, and labels
# repelled from one another.
# NOTE(review): `interactive` is not among ggscatter()'s documented arguments
# and is forwarded through `...`; confirm it has any effect.
g3 <- ggscatter(
  df_4,
  x = "dim1",
  y = "dim2",
  color = "groups",
  palette = "jco",
  size = 1,
  label = df_4$country,
  repel = TRUE,
  ellipse = TRUE,
  ellipse.type = "convex",
  interactive = TRUE
)
# Render the plot as a ggiraph widget on a 13 x 7 SVG canvas with rescaling on.
girafe(
  ggobj = g3,
  width_svg = 13,
  height_svg = 7,
  options = list(opts_sizing(rescale = TRUE, width = 1.0))
)
# Countries whose names are printed next to their point in the final plot;
# entries are matched against the CountryName column of the joined data.
# NOTE(review): the original entry "iran_Iran (Islamic Republic of)" looks like
# a typo that would never match. The plain UN name is added alongside it so
# Iran is labelled either way -- confirm against CountryRefData.csv and drop
# whichever spelling is wrong.
country_displayed <- c(
  "China", "France", "United States of America",
  "United Kingdom of Great Britain and Northern Ireland", "Russian Federation",
  "Israel", "iran_Iran (Islamic Republic of)", "Iran (Islamic Republic of)",
  "India", "Pakistan", "Ethiopia"
)
# Country reference table, joined below on its CountryCode column.
# NOTE(review): presumably also the source of the CountryName and Region
# columns used in the plot -- confirm against the CSV header.
df_ref <- read_csv("archetypes/CountryRefData.csv")
# Attach the reference data to the MDS/cluster table (inner join: countries
# without a match on either side are dropped) and add per-point plot values.
df_joined <- df_4 %>%
  inner_join(df_ref, by = c("country" = "CountryCode")) %>%
  mutate(
    groups = as_factor(groups),
    # "ETH" gets a larger size value than every other country.
    point_size = if_else(country == "ETH", 50, 40),
    # Both branches are 1, so alpha is currently uniform.
    point_alpha = if_else(country == "ETH", 1, 1)
  )
# Final interactive plot: one point per country at its MDS coordinates,
# coloured by cluster, with a hover tooltip giving the country name and region.
# Only the countries listed in country_displayed get a permanent text label.
# NOTE(review): size and alpha are mapped inside aes(), so the 40/50 and 1
# values go through ggplot2's default scales rather than being used literally
# -- confirm that is intended.
g1 <- ggplot(df_joined, aes(x = dim1, y = dim2)) +
  geom_point_interactive(
    aes(
      tooltip = paste0(CountryName, "\n", Region),
      color = groups,
      size = point_size,
      alpha = point_alpha
    )
  ) +
  geom_text_repel(
    aes(label = CountryName),
    data = df_joined[df_joined$CountryName %in% country_displayed, ],
    size = 10
  ) +
  theme_tufte(base_size = 15) +
  # Strip title, axes, grid and legend so only the point layout remains.
  theme(
    plot.title = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none",
    # Margins are top, right, bottom, left. No trailing comma after the last
    # argument, so theme() never receives an empty argument.
    plot.margin = unit(c(1, 5, 1, 1), "lines")
  )
# Render the plot as a ggiraph widget on a 13 x 7 SVG canvas with rescaling on.
girafe(
  ggobj = g1,
  width_svg = 13,
  height_svg = 7,
  options = list(opts_sizing(rescale = TRUE, width = 1.0))
)